%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
np.random.seed(0)
plt.style.use("ggplot")
import tensorflow as tf
print('Tensorflow version:', tf.__version__)
print('GPU detected:', tf.config.list_physical_devices('GPU'))
Tensorflow version: 2.3.0 GPU detected: []
# Load the NER corpus; the columns used below are "Sentence #", "Word", "POS"
# and "Tag".
data = pd.read_csv("ner_dataset.csv", encoding="latin1")
# Forward-fill missing cells from the row above.
# NOTE(review): presumably "Sentence #" is only populated on the first token
# of each sentence - verify against the CSV.
data = data.ffill()

# Build the vocabulary in sorted order. Iterating a set of strings yields a
# different order on every interpreter start (string hashes are randomised),
# which would silently change every word/tag id between kernel restarts and
# make previously saved weights meaningless. key=str keeps the sort
# well-defined even if a non-string value slipped into the column.
words = sorted(set(data["Word"].values), key=str)
words.append("ENDPAD")  # extra token reserved for sequence padding
num_words = len(words)

tags = sorted(set(data["Tag"].values), key=str)
num_tags = len(tags)
# Number of tokens per entity tag. The "O" (outside any entity) tag is
# removed by label rather than by position: the previous `iloc[:-1]` only
# dropped "O" because it happens to sort after every B-*/I-* tag.
table1 = (
    data.groupby("Tag")["Tag"]
    .agg(["count"])
    .drop(index="O", errors="ignore")
    .reset_index()
)
table1
| | Tag | count |
|---|---|---|
| 0 | B-art | 402 |
| 1 | B-eve | 308 |
| 2 | B-geo | 37644 |
| 3 | B-gpe | 15870 |
| 4 | B-nat | 201 |
| 5 | B-org | 20143 |
| 6 | B-per | 16990 |
| 7 | B-tim | 20333 |
| 8 | I-art | 297 |
| 9 | I-eve | 253 |
| 10 | I-geo | 7414 |
| 11 | I-gpe | 198 |
| 12 | I-nat | 51 |
| 13 | I-org | 16784 |
| 14 | I-per | 17251 |
| 15 | I-tim | 6528 |
import plotly.express as px

# One bar per tag, with the bar height taken from the "count" column of table1.
fig = px.histogram(
    data_frame=table1,
    x="Tag",
    y="count",
    color="Tag",
    title="Count of Tag in NER dataset",
)
fig.show()
import os

import chart_studio
import chart_studio.plotly as py

# Chart Studio (plotly.com) credentials come from the environment so that no
# secret is stored in the notebook.
# NOTE(review): an API key used to be hard-coded in this cell; that key should
# be regenerated because it has been exposed in the notebook source.
username = os.environ.get("CHART_STUDIO_USERNAME", "beiqizhou")  # Chart Studio username
api_key = os.environ.get("CHART_STUDIO_API_KEY")  # Chart Studio API key
if api_key:
    chart_studio.tools.set_credentials_file(username=username, api_key=api_key)
# When CHART_STUDIO_API_KEY is unset, the upload relies on whatever
# credentials chart_studio already has configured on this machine.

# Upload the tag-count figure; the returned URL is the cell's output.
py.plot(fig, filename='NER_dataset', auto_open=True)
'https://plotly.com/~beiqizhou/42/'
import tqdm
def sentence_integrate(data):
    """Group token rows into sentences.

    Parameters
    ----------
    data : pandas.DataFrame
        Must provide the columns "Sentence #", "Word", "POS" and "Tag".

    Returns
    -------
    list of list of tuple
        One inner list per distinct "Sentence #" value, containing that
        sentence's (word, pos, tag) tuples in row order. Sentences are
        returned in pandas' default sorted group-key order. A frame with no
        rows yields an empty list.
    """
    # Iterate the groups directly instead of groupby.apply(lambda).tolist():
    # same result for non-empty input, but it also handles an empty frame and
    # avoids the apply-on-grouping-columns path that recent pandas warns about.
    return [
        list(
            zip(
                group["Word"].tolist(),
                group["POS"].tolist(),
                group["Tag"].tolist(),
            )
        )
        for _, group in data.groupby("Sentence #")
    ]
# Collapse the token table into per-sentence lists of (word, POS, tag) tuples.
sentences = sentence_integrate(data)

# Lookup tables: word ids start at 1 (id 0 is left unassigned), tag ids at 0.
word2idx = dict(zip(words, range(1, len(words) + 1)))
tag2idx = dict(zip(tags, range(len(tags))))
import chart_studio.plotly as py

# Number of tokens in each sentence, as a one-column frame named "length".
sentence_lengths = list(map(len, sentences))
table2 = pd.DataFrame(sentence_lengths).rename(columns={0: "length"})

# Histogram of sentence lengths with a box plot in the margin.
fig1 = px.histogram(
    table2,
    x="length",
    marginal='box',
    title="Count of Sentence Lengths",
)
fig1.show()

# Upload the figure to Chart Studio; the returned URL is the cell's output.
py.plot(fig1, filename='NER_sentence', auto_open=True)
C:\Users\becky\anaconda3\lib\site-packages\chart_studio\plotly\plotly.py:222: UserWarning: Woah there! Look at all those points! Due to browser limitations, the Plotly SVG drawing functions have a hard time graphing more than 500k data points for line charts, or 40k points for other types of charts. Here are some suggestions: (1) Use the `plotly.graph_objs.Scattergl` trace object to generate a WebGl graph. (2) Trying using the image API to return an image instead of a graph URL (3) Use matplotlib (4) See if you can create your visualization with fewer data points If the visualization you're using aggregates points (e.g., box plot, histogram, etc.) you can disregard this warning.
'https://plotly.com/~beiqizhou/45/'
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from tensorflow.keras import Model, Input
from tensorflow.keras.layers import LSTM, Embedding, Dense
from tensorflow.keras.layers import InputLayer, TimeDistributed, SpatialDropout1D, Bidirectional
from tensorflow import keras

max_len = 50        # every sentence is padded or truncated to this many tokens
embedding_dim = 50  # word-vector size; previously written as max_len (same
                    # value), but the two quantities are unrelated

# Encode words and pad with the dedicated "ENDPAD" id. The old padding value,
# num_words - 1, is the id word2idx gives to the last *real* vocabulary word
# (ids start at 1), so padding was indistinguishable from that word.
pad_word_idx = word2idx["ENDPAD"]
X = [[word2idx[w[0]] for w in s] for s in sentences]
X = pad_sequences(maxlen=max_len, sequences=X, padding="post", value=pad_word_idx)

# Encode tags; padded positions are labelled "O".
y = [[tag2idx[w[2]] for w in s] for s in sentences]
y = pad_sequences(maxlen=max_len, sequences=y, padding="post", value=tag2idx["O"])

#split into test and train
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

#build model
# The embedding table must cover the largest id in use, including "ENDPAD".
vocab_size = max(word2idx.values()) + 1

model = keras.Sequential()
# (max_len,) is a 1-tuple; the former (max_len) was just the integer itself.
model.add(InputLayer((max_len,)))
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_len))
model.add(SpatialDropout1D(0.1))
model.add(Bidirectional(LSTM(units=100, return_sequences=True, recurrent_dropout=0.1)))
# Per-token classification head. Without it the network emits the raw BiLSTM
# features rather than the num_tags class probabilities that
# sparse_categorical_crossentropy expects.
model.add(TimeDistributed(Dense(num_tags, activation="softmax")))
model.compile(optimizer="adam",
              loss="sparse_categorical_crossentropy",
              metrics=["accuracy"])
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
from livelossplot.tf_keras import PlotLossesCallback
%%time
logdir="log/"
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=logdir)
chkpt = ModelCheckpoint("model_weights.h5", monitor='val_loss',verbose=1, save_best_only=True, save_weights_only=True, mode='min')
early_stopping = EarlyStopping(monitor='val_accuracy', min_delta=0, patience=5, verbose=0, mode='max', baseline=None, restore_best_weights=False)
callbacks = [PlotLossesCallback(), chkpt, early_stopping,tensorboard_callback]
history = model.fit(
x=x_train,
y=y_train,
validation_data=(x_test,y_test),
batch_size=32,
epochs=10,
callbacks=callbacks,
verbose=1
)
accuracy training (min: 0.945, max: 0.985, cur: 0.985) validation (min: 0.969, max: 0.980, cur: 0.979) Loss training (min: 0.056, max: 0.311, cur: 0.056) validation (min: 0.091, max: 0.168, cur: 0.091) Epoch 00010: val_loss improved from 0.09195 to 0.09069, saving model to model_weights.h5 1199/1199 [==============================] - 61s 51ms/step - loss: 0.0562 - accuracy: 0.9855 - val_loss: 0.0907 - val_accuracy: 0.9791 Wall time: 10min 1s
# Per-epoch training metrics as a table; the former row index becomes the
# "epoches" column.
table3 = (
    pd.DataFrame(history.history)
    .reset_index()
    .rename(columns={"index": "epoches"})
)
table3
| | epoches | loss | accuracy | val_loss | val_accuracy |
|---|---|---|---|---|---|
| 0 | 0 | 0.311418 | 0.944632 | 0.168070 | 0.970817 |
| 1 | 1 | 0.144287 | 0.967048 | 0.121573 | 0.968609 |
| 2 | 2 | 0.096323 | 0.977821 | 0.101789 | 0.978603 |
| 3 | 3 | 0.078048 | 0.982856 | 0.100686 | 0.979627 |
| 4 | 4 | 0.069140 | 0.983831 | 0.099087 | 0.977544 |
| 5 | 5 | 0.064875 | 0.984508 | 0.100929 | 0.977358 |
| 6 | 6 | 0.065381 | 0.984388 | 0.091951 | 0.980108 |
| 7 | 7 | 0.075274 | 0.983051 | 0.099932 | 0.979568 |
| 8 | 8 | 0.061041 | 0.984867 | 0.096168 | 0.978899 |
| 9 | 9 | 0.056194 | 0.985478 | 0.090689 | 0.979060 |
# Pull each metric column out of table3 as a plain Python list for plotting.
x_value = table3["epoches"].tolist()
y_accuracy = table3["accuracy"].tolist()
y_val_accuracy = table3["val_accuracy"].tolist()
y_loss = table3["loss"].tolist()
y_val_loss = table3["val_loss"].tolist()
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import chart_studio.plotly as py

# Two side-by-side panels: accuracy curves on the left, loss curves on the right.
fig2 = make_subplots(rows=1, cols=2)

# (values, legend name, subplot column) for each curve, in drawing order.
curves = [
    (y_accuracy, "Accuracy", 1),
    (y_val_accuracy, "Validation Accuracy", 1),
    (y_loss, "Loss", 2),
    (y_val_loss, "Validation Loss", 2),
]
for values, label, column in curves:
    fig2.add_trace(
        go.Scatter(x=x_value, y=values, name=label),
        row=1, col=column
    )

# Update xaxis properties
for column in (1, 2):
    fig2.update_xaxes(title_text="Number of Epochs", row=1, col=column)

# Update yaxis properties
# The plotted accuracy values are 0-1 fractions and the loss is a
# cross-entropy value, so neither axis is a percentage.
fig2.update_yaxes(title_text="Accuracy", row=1, col=1)
fig2.update_yaxes(title_text="Loss", row=1, col=2)

fig2.update_layout(height=500, width=800, title_text="Model's Accuracy and Loss")
fig2.show()

# Upload the figure to Chart Studio; the returned URL is the cell's output.
py.plot(fig2, filename='NER_model', auto_open=True)